# -*- coding: utf-8 -*-
from __future__ import division
import sys, argparse
import csv
import time as t
import numpy as np
import datetime as d
import pandas as pd
'''
衡量用户对该门课的投入程度
用户上这门课的event占用户最近所有event的比例
最近N天
'''
def parse_args():
    """Parse the command line and return the arguments as a plain dict.

    Four positional arguments are expected, in this order: ``n_days``,
    ``enrollment_path``, ``enrollment_time_path`` and ``output``.  All values
    are returned as the raw strings given on the command line.

    When the script is started without any argument, ``-h`` is appended to
    ``sys.argv`` so that argparse prints the usage text and exits.
    """
    no_arguments_given = len(sys.argv) == 1
    if no_arguments_given:
        sys.argv.append('-h')

    parser = argparse.ArgumentParser()
    positional_names = (
        'n_days',  # how many days back from now to look
        'enrollment_path',
        'enrollment_time_path',
        'output',
    )
    for positional in positional_names:
        parser.add_argument(positional)

    return vars(parser.parse_args())

# Command-line configuration.
args = parse_args()

# Size of the look-back window in days (cal_latest_N treats -1 as
# "count every record").
N = int(args['n_days'])
enrollment_path = args['enrollment_path']
time_path = args['enrollment_time_path']
output = args['output']

# Layout of the timestamps handed to strptime in cal_latest_N.
time_format = "%Y-%m-%dT%H:%M:%S"

# Load both tables indexed by enrollment_id and put their columns side by
# side, keeping only the enrollment ids that appear in both files.
time = pd.read_csv(time_path, index_col='enrollment_id')
enrollment = pd.read_csv(enrollment_path, index_col='enrollment_id')
merged_data = pd.concat([enrollment, time], join='inner', axis=1)
# Keep the id available as a regular column as well as the index.
merged_data['enrollment_id'] = merged_data.index

# Map each username to the list of enrollment ids that belong to it.
user_enrollmentids = {}
for username, enrollment_id in merged_data[["username", "enrollment_id"]].values:
    user_enrollmentids.setdefault(username, []).append(enrollment_id)

# Event types; each one is expanded into a column name "event_<name>_date".
# NOTE(review): "nagivate" looks like a misspelling of "navigate", but it is
# part of a column name, so it presumably matches the header of the input
# CSV -- confirm against the data before changing it.
events = ["overall", "problem", "video", "access", "wiki", "discussion", "nagivate", "page_close"]

def cal_latest_N(rcd, N, fmt=None):
    """Count the timestamps in ``rcd`` lying within ``N`` days of the latest one.

    Parameters:
        rcd: whitespace-separated timestamps in a single string.
        N:   window size in whole days; the special value -1 means "no
             window" and simply returns the number of timestamps.
        fmt: strptime format of the timestamps.  Defaults to the module-level
             ``time_format`` when omitted.

    Returns:
        The number of timestamps whose distance to the most recent timestamp,
        measured in whole days (``timedelta.days``), is at most ``N``.
        An empty or whitespace-only record yields 0.

    Raises:
        ValueError: if a timestamp does not match the format.
    """
    stamps = rcd.split()
    if N == -1:
        return len(stamps)
    # Nothing to parse: avoid indexing into an empty list.
    if not stamps:
        return 0
    if fmt is None:
        fmt = time_format

    dates = [d.datetime.strptime(stamp, fmt) for stamp in stamps]
    # Use the true maximum rather than the last element so the result does not
    # depend on the record being sorted chronologically.
    latest_date = max(dates)
    return sum(1 for date in dates if (latest_date - date).days <= N)

# Per-enrollment event counts inside the look-back window: one row per
# enrollment_id, one column per event type.
out_data = pd.DataFrame()
out_data["enrollment_id"] = merged_data.enrollment_id
out_data = out_data.set_index('enrollment_id')

for event in events:
    key = "event_%s_date" % event
    out_key = key + "_count.latest.%d" % N
    # na_action="ignore" leaves missing records as NaN instead of passing
    # them to cal_latest_N.
    out_data[out_key] = merged_data[key].map(lambda x: cal_latest_N(x, N), na_action="ignore")


# A single-argument print call behaves the same as the Python 2 print
# statement, so this line is valid on both Python 2 and Python 3.
print('get user total count ...')

# user_event_total_count[event][username] -> sum of that user's counts for
# the event over all of the user's enrollments.
user_event_total_count = {}
for event in events:
    out_key = "event_%s_date_count.latest.%d" % (event, N)
    # Group the per-enrollment counts by the owning user.  GroupBy.sum skips
    # NaN entries, so one enrollment without this event type no longer turns
    # the user's whole total into NaN (NaN is truthy, so a plain
    # "c if c else 0" guard does not filter it).
    # NOTE(review): rows whose username is missing are dropped by groupby --
    # presumably usernames are never null; confirm against the input data.
    totals = out_data[out_key].groupby(merged_data['username']).sum()
    # Iterating .values keeps the totals as numpy scalars, so a later
    # division by a zero total yields nan/inf instead of raising.
    user_event_total_count[event] = dict(zip(totals.index, totals.values))

def get_ratio(rcd, username, event):
    """Return ``rcd`` divided by the user's total count for ``event``.

    Parameters:
        rcd:      the count to normalise.
        username: key into the per-user totals.
        event:    event type, key into the module-level
                  ``user_event_total_count`` mapping.

    Returns:
        ``rcd / user_event_total_count[event][username]``.

    Raises:
        KeyError: if ``event`` or ``username`` is not present in
                  ``user_event_total_count``.
    """
    return rcd / user_event_total_count[event][username]

# Expose the id as a regular column again and attach the owning user; both
# assignments align on the enrollment_id index.
out_data['enrollment_id'] = out_data.index
out_data['username'] = merged_data['username']

# Final table: identifiers plus one ratio column per event type.
outout_data = pd.DataFrame()
outout_data['enrollment_id'] = out_data['enrollment_id']
outout_data['username'] = out_data['username']

for event in events:
    out_key = "event_%s_date_count.latest.%d" % (event, N)
    ratio_key = "user_course_event_%s_count_ratio" % event
    outout_key = "user_course_event_%s_count_ratio.%d" % (event, N)

    # Look up every row's per-user total, then divide column-wise.  Each row
    # is computed from its own values, so nothing can leak over from the
    # previous row; a NaN count or an unknown user gives a NaN ratio.
    user_totals = out_data['username'].map(user_event_total_count[event])
    out_data[ratio_key] = out_data[out_key] / user_totals

    # Write the ratio itself under the ratio column name (not the raw count
    # column, which is what out_key holds).
    outout_data[outout_key] = out_data[ratio_key]

# Single-argument print calls behave the same as the Python 2 print
# statement, so these lines are valid on both Python 2 and Python 3.
print(outout_data.head())
print(' '.join(('output to ', output)))
outout_data.to_csv(output, index=False)
